Challenge 9: Baby Names

Author

Luke Thilmony

Setup

library(tidyverse)
library(knitr)
library(DT)
# Read StateNames_A.csv from the project's supporting_artifacts directory
# (path resolved relative to the project root by here::here()).
names_a <- here::here('supporting_artifacts', 'StateNames_A.csv') |>
  read_csv()

Dataset Preview

# Interactive preview of the raw data. Only the first rows are sent to the
# widget: embedding the full data set in a client-side DataTable triggers DT's
# "data is too big" warning and bloats the rendered page.
names_a |>
  head(1000) |>
  datatable(class = 'cell-border stripe')

3 Summarizing and Visualizing

1.

# Total number of babies named "Allison" in each state, split by sex:
# one row per state, one count column per sex.
names_allison <- names_a |>
  filter(Name == 'Allison') |>
  group_by(State, Gender) |>
  # Drop the grouping explicitly: the result is a plain summary table, and
  # this also avoids summarise()'s "grouped output" message.
  summarize(sum = sum(Count), .groups = 'drop') |>
  pivot_wider(
    names_from = Gender,
    values_from = sum,
    # A state/sex combination with no records has no row to pivot; report it
    # as a count of 0 rather than NA (applies to both sexes).
    values_fill = 0
  ) |>
  rename(
    "Number of Female Babies" = `F`,
    "Number of Male Babies" = M
  )
# Render the per-state Allison counts as a pipe-format table.
kable(names_allison, format = 'pipe')
State Number of Female Babies Number of Male Babies
AK 232 0
AL 1535 0
AR 1198 0
AZ 1880 0
CA 12413 0
CO 1594 0
CT 1099 0
DC 321 0
DE 294 0
FL 4455 0
GA 3257 0
HI 183 0
IA 1477 0
ID 451 0
IL 5110 0
IN 3067 0
KS 1283 0
KY 1905 20
LA 1209 0
MA 2218 0
MD 2229 0
ME 340 0
MI 4014 0
MN 2374 0
MO 2882 0
MS 817 0
MT 226 0
NC 3435 0
ND 285 0
NE 807 0
NH 412 0
NJ 3052 0
NM 399 0
NV 729 0
NY 5747 0
OH 5487 0
OK 1421 0
OR 1186 0
PA 4307 0
RI 306 0
SC 1228 0
SD 376 0
TN 2488 0
TX 10192 0
UT 1125 0
VA 3220 0
VT 135 0
WA 1956 0
WI 2367 0
WV 813 0
WY 142 0
# Interactive version of the per-state Allison table.
names_allison |>
  datatable(class = 'cell-border stripe')

2.

# Keep only the state and the female count for the rest of the analysis.
names_allison <- select(
  names_allison,
  State,
  `Number of Female Babies`
)

3.

# Yearly total of babies named "Allison", summed over every state and sex.
allison_years <- names_a |>
  filter(Name == 'Allison') |>
  group_by(Year) |>
  summarize(sum = sum(Count))

# Line chart of the yearly number of babies named "Allison".
allison_years |>
  ggplot(mapping = aes(x = Year, y = sum)) +
  geom_line() +
  labs(
    # NULL is the documented way to remove an axis label; element_blank() is
    # a theme element, not a label value. The title already says what the
    # y-axis measures.
    y = NULL,
    title = 'Frequency of Babies Named "Allison"'
  )

4 Modeling the Number of Allisons

4.

# Simple linear regression of the yearly Allison count on the year.
allison_model <- lm(sum ~ Year, data = allison_years)

5.

# Scatter plot of the yearly Allison counts with the fitted regression line.
allison_model |>
  # Hand ggplot() an explicit data frame instead of the lm object itself,
  # which would rely on ggplot2's implicit fortify() conversion of the model.
  # This matches how the residual plot below obtains its data.
  broom::augment() |>
  ggplot(mapping = aes(x = Year, y = sum)) +
  geom_point() +
  # State the smoothing formula explicitly so ggplot2 neither has to guess it
  # nor prints the "using formula" message.
  stat_smooth(method = 'lm', formula = y ~ x) +
  labs(
    y = 'Number of Babies Named Allison'
  )

6.

$$\widehat{\text{count}}_{\text{Allison}} = 209815.1 - 101.6\,(\text{Year})$$

7.

# Residuals-versus-fitted plot for the linear model, with a dashed reference
# line at zero.
ggplot(
  data = broom::augment(allison_model),
  mapping = aes(x = .fitted, y = .resid)
) +
  geom_point() +
  geom_hline(yintercept = 0, linetype = 'dashed') +
  labs(title = 'Residual Plot', x = 'Predicted', y = 'Residuals')

There are no clear trends in the residual plot, but the observations with predicted counts between roughly 5,900 and 6,600 were all underestimated by the model (their residuals are all positive).

8.

Your name is declining in coolness/popularity: the fitted model estimates that about 102 fewer babies are named Allison with each passing year.

5 Spelling by State

1.

# Keep only the records for the three spellings being compared.
names_allan <- names_a |>
  filter(Name %in% c('Allan', 'Alan', 'Allen'))

2.

# Number of babies with each spelling (Alan / Allan / Allen) in California
# and Pennsylvania in the year 2000: one row per state, one column per
# spelling.
names_allan <- names_allan |>
  filter(
    State %in% c('CA', 'PA'),
    # Compare Year with a numeric literal rather than the string '2000', so
    # the filter does not lean on implicit type coercion (assuming read_csv()
    # parsed Year as a number).
    Year == 2000
  ) |>
  group_by(State, Name) |>
  # 'drop_last' is what summarize() was already doing implicitly. Spelling it
  # out keeps the result grouped by State, exactly as before, and avoids the
  # informational message.
  summarize(sum = sum(Count), .groups = 'drop_last') |>
  pivot_wider(
    names_from = Name,
    values_from = sum
  ) |>
  rename(
    'Frequency of "Alan"' = Alan,
    'Frequency of "Allan"' = Allan,
    'Frequency of "Allen"' = Allen
  )
# Render the spelling counts by state as a pipe-format table.
kable(names_allan, format = 'pipe')
State Frequency of “Alan” Frequency of “Allan” Frequency of “Allen”
CA 584 131 176
PA 51 12 56

3.

# Share of each spelling within a state, as a percentage of that state's
# combined Alan/Allan/Allen count.
names_allan_pct <- names_allan |>
  mutate(
    # Add the three columns row by row. sum(a, b, c) would collapse every row
    # in the current group into one number, which is only a per-state total
    # when the input happens to still be grouped by State.
    total = `Frequency of "Alan"` +
      `Frequency of "Allan"` +
      `Frequency of "Allen"`,
    `Percent "Alan"` = (`Frequency of "Alan"` / total) * 100,
    `Percent "Allan"` = (`Frequency of "Allan"` / total) * 100,
    `Percent "Allen"` = (`Frequency of "Allen"` / total) * 100
  ) |>
  # Keep the state and the percentages; drop the helper total and raw counts.
  select(
    -total,
    -`Frequency of "Alan"`,
    -`Frequency of "Allan"`,
    -`Frequency of "Allen"`
  )

# Render the spelling percentages by state as a pipe-format table.
kable(names_allan_pct, format = 'pipe')
State Percent “Alan” Percent “Allan” Percent “Allen”
CA 65.54433 14.70258 19.75309
PA 42.85714 10.08403 47.05882

Among babies born in 2000 with one of these three spellings, in CA 65.54% were named Alan, 14.70% were named Allan, and 19.75% were named Allen. In PA, 42.86% were named Alan, 10.08% were named Allan, and 47.06% were named Allen.